This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents. For more details on using R Markdown see http://rmarkdown.rstudio.com.
When you click the Knit button a document will be generated that includes both content as well as the output of any embedded R code chunks within the document. You can embed an R code chunk like this:
#install.packages("data.table")
#install.packages("tm")
#update.packages("tm", checkBuilt = TRUE)
#install.packages("SnowballC")
#install.packages("rsconnect")
#install.packages("dplyr")
#install.packages("tidytext")
#install.packages("mldr")
#install.packages("Hmisc")
#install.packages("ggplot2")
#install.packages("wordcloud")
#install.packages("RColorBrewer")
#install.packages("stringr")
#install.packages("xgboost")
#install.packages("DT")
#install.packages("dplyr")
#install.packages("caret")
library(caret)
## Loading required package: lattice
## Loading required package: ggplot2
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(DT)
library(xgboost)
##
## Attaching package: 'xgboost'
## The following object is masked from 'package:dplyr':
##
## slice
library(stringr)
library("wordcloud")
## Loading required package: RColorBrewer
library("RColorBrewer")
library(ggplot2)
library(Hmisc)
## Loading required package: survival
##
## Attaching package: 'survival'
## The following object is masked from 'package:caret':
##
## cluster
## Loading required package: Formula
##
## Attaching package: 'Hmisc'
## The following objects are masked from 'package:dplyr':
##
## src, summarize
## The following objects are masked from 'package:base':
##
## format.pval, units
library(mldr)
##
## Attaching package: 'mldr'
## The following objects are masked from 'package:caret':
##
## precision, recall
library("data.table")
##
## Attaching package: 'data.table'
## The following objects are masked from 'package:dplyr':
##
## between, first, last
library("tm")
## Loading required package: NLP
##
## Attaching package: 'NLP'
## The following object is masked from 'package:ggplot2':
##
## annotate
library(SnowballC)
library(rsconnect)
## Warning: package 'rsconnect' was built under R version 3.4.4
library(tidytext)
## Warning: package 'tidytext' was built under R version 3.4.4
library(e1071)
##
## Attaching package: 'e1071'
## The following object is masked from 'package:Hmisc':
##
## impute
importing of training and test data
setwd("/Users/teenathampan/CapstoneProject/Project/")
traindata = fread("train.csv", header = "auto", sep="auto", nrows=-1L,blank.lines.skip=TRUE, encoding="UTF-8")
testdata = fread("test.csv", header = "auto", sep="auto", nrows=-1L,blank.lines.skip=TRUE, encoding="UTF-8")
# marking data as train and test data as well as creating dummy class columns in order to clean comments at the same time
traindata$type <- "train"
testdata$toxic<- 0
testdata$severe_toxic<- 0
testdata$obscene<- 0
testdata$threat<- 0
testdata$insult<- 0
testdata$identity_hate<- 0
testdata$type<- "test"
dataset <- rbind(traindata, testdata)
str(dataset)
## Classes 'data.table' and 'data.frame': 312735 obs. of 9 variables:
## $ id : chr "0000997932d777bf" "000103f0d9cfb60f" "000113f07ec002fd" "0001b41b1c6bb37e" ...
## $ comment_text : chr "Explanation\nWhy the edits made under my username Hardcore Metallica Fan were reverted? They weren't vandalisms"| __truncated__ "D'aww! He matches this background colour I'm seemingly stuck with. Thanks. (talk) 21:51, January 11, 2016 (UTC)" "Hey man, I'm really not trying to edit war. It's just that this guy is constantly removing relevant information"| __truncated__ "\"\"\nMore\nI can't make any real suggestions on improvement - I wondered if the section statistics should be l"| __truncated__ ...
## $ toxic : num 0 0 0 0 0 0 1 0 0 0 ...
## $ severe_toxic : num 0 0 0 0 0 0 1 0 0 0 ...
## $ obscene : num 0 0 0 0 0 0 1 0 0 0 ...
## $ threat : num 0 0 0 0 0 0 0 0 0 0 ...
## $ insult : num 0 0 0 0 0 0 1 0 0 0 ...
## $ identity_hate: num 0 0 0 0 0 0 0 0 0 0 ...
## $ type : chr "train" "train" "train" "train" ...
## - attr(*, ".internal.selfref")=<externalptr>
# function to clean comment field
clean_text<- function(text){
# text to lower case
text <- tolower(text)
# remove linebreaks
text<- gsub("\n", " ", text)
# remove extra white spaces to one space
text<- gsub("\\s+", " ", text)
# transform short forms
text<- gsub("'ll", " will", text)
text<- gsub("i'm", "i am", text)
text<- gsub("'re", " are", text)
text<- gsub("'s", " is", text)
text<- gsub("'ve", " have", text)
text<- gsub("'d", " would", text)
text<- gsub("can't", "can not", text)
text<- gsub("don't", "do not", text)
text<- gsub("doesn't", "does not", text)
text<- gsub("isn't", "is not", text)
text<- gsub("aren't", "are not", text)
text<- gsub("couldn't", "could not", text)
text<- gsub("mustn't", "must not", text)
text<- gsub("didn't", "did not", text)
text<- gsub("weren't", "were not", text)
# remove incorrect text
text<- gsub("f+u+c+k+\\b", "fuck", text)
# remove graphics
text<- gsub("[^[:graph:]]", " ", text)
# remove punctuation
text<- gsub("[[:punct:]]", " ", text)
# remove digits
text<- gsub("[[:digit:]]", " ", text)
# strip multiple whitspace to one
text<- gsub("\\s+", " ", text)
# remove "shittext"
text <- gsub("\\b(a|e)w+\\b", "AWWWW", text)
text <- gsub("\\b(y)a+\\b", "YAAAA", text)
text <- gsub("\\b(w)w+\\b", "WWWWW", text)
text <- gsub("\\b((l+)(a+))+\\b", "LALALA", text)
text <- gsub("(w+)(o+)(h+)(o+)", "WOHOO", text)
text <- gsub("\\b(d?(u+)(n+)?(h+))\\b", "UUUHHH", text)
text <- gsub("\\b(a+)(r+)(g+)(h+)\\b", "ARGH", text)
text <- gsub("\\b(a+)(w+)(h+)\\b", "AAAWWHH", text)
text <- gsub("\\b(p+)(s+)(h+)\\b", "SHHHHH", text)
text <- gsub("\\b((s+)(e+)?(h+))+\\b", "SHHHHH", text)
text <- gsub("\\b(s+)(o+)\\b", "", text)
text <- gsub("\\b(h+)(m+)\\b", "HHMM", text)
text <- gsub("\\b((b+)(l+)(a+)(h+)?)+\\b", "BLABLA", text)
text <- gsub("\\b((y+)(e+)(a+)(h+)?)+\\b", "YEAH", text)
text <- gsub("\\b((z+)?(o+)(m+)(f+)?(g+))+\\b", "OMG", text)
text <- gsub("aa(a+)", "a", text)
text <- gsub("ee(e+)", "e", text)
text <- gsub("i(i+)", "i", text)
text <- gsub("oo(o+)", "o", text)
text <- gsub("uu(u+)", "u", text)
text <- gsub("\\b(u(u+))\\b", "u", text)
text <- gsub("y(y+)", "y", text)
text <- gsub("hh(h+)", "h", text)
text <- gsub("gg(g+)", "g", text)
text <- gsub("tt(t+)\\b", "t", text)
text <- gsub("(tt(t+))", "tt", text)
text <- gsub("mm(m+)", "m", text)
text <- gsub("ff(f+)", "f", text)
text <- gsub("cc(c+)", "c", text)
text <- gsub("\\b(kkk)\\b", "KKK", text)
text <- gsub("\\b(pkk)\\b", "PKK", text)
text <- gsub("kk(k+)", "kk", text)
text <- gsub("fukk", "fuck", text)
text <- gsub("k(k+)\\b", "k", text)
text <- gsub("f+u+c+k+\\b", "fuck", text)
text <- gsub("((a+)|(h+)){3,}", "HAHEHI", text)
text <- gsub("mothjer", "mother", text)
text <- gsub("wikipedia ", " ", text)
text <- gsub("wiki ", " ", text)
#remove non ascii words
text <- gsub("[^\x20-\x7e]+", " ", text)
# remove stopwords
otherstopwords<- c("can", "will", "don", "now", "just", "also", "may", "get", "well", "need", "say", "way", "want", "see", "read", "look", "stop", "like", "really", "however", "let", "ask", "used", "made", "much", "utc", "added", "didn", "sure", "put", "better", "using", "tell", "anything", "one", "two", "wiki", "wikipedia", "first", "second", "however", "hahehi", "peopl", "talk", "page", "edit", "articl", "user", "make", "put", "far", "bit", "well", "still", "much", "one", "two", "don", "now", "even", "article", "articles", "edit", "edits", "page", "pages","talk", "editor", "ax", "edu", "subject", "lines", "like", "likes", "line","uh", "oh", "also", "get", "just", "hi", "hello", "ok", "ja", "editing", "edited","dont", "wikipedia", "hey", "however", "id", "yeah", "yo", "use", "need", "take", "give", "say", "user", "day", "want", "tell", "even", "look", "one", "make", "come", "see", "said", "now", "know", "talk", "read", "time", "sentence", "ain't", "wow", "image", "jpg", "copyright","wikiproject", "background color", "align", "px", "pixel",
"org", "com", "en", "ip", "ip address", "http", "www", "html", "htm",
"wikimedia", "https", "httpimg", "url", "urls", "utc", "uhm",
"i", "me", "my", "myself", "we", "our", "ours", "ourselves",
"you", "your", "yours", "yourself", "yourselves",
"he", "him", "his", "himself",
"she", "her", "hers", "herself",
"it", "its", "itself",
"they", "them", "their", "theirs", "themselves",
"i'm", "you're", "he's", "i've", "you've", "we've", "we're",
"she's", "it's", "they're", "they've",
"i'd", "you'd", "he'd", "she'd", "we'd", "they'd",
"i'll", "you'll", "he'll", "she'll", "we'll", "they'll",
"what", "which", "who", "whom", "this", "that", "these", "those",
"am", "can", "will", "not",
"is", "was", "were", "have", "has", "had", "having", "wasn't", "weren't", "hasn't",
"are", "cannot", "isn't", "aren't", "doesn't", "don't", "can't", "couldn't", "mustn't", "didn't",
"haven't", "hadn't", "won't", "wouldn't",
"do", "does", "did", "doing", "would", "should", "could",
"be", "been", "being", "ought", "shan't", "shouldn't", "let's", "that's", "who's", "what's", "here's",
"there's", "when's", "where's", "why's", "how's", "a", "an", "the", "and", "but", "if",
"or", "because", "as", "until", "while", "of", "at", "by", "for", "with", "about", "against",
"between", "into", "through", "during", "before", "after", "above", "below", "to", "from",
"up", "down", "in", "out", "on", "off", "over", "under", "again", "further", "then", "once",
"here", "there", "when", "where", "why", "how", "all", "any", "both", "each", "few", "more",
"most", "other", "some", "such", "no", "nor", "only", "own", "same", "so", "than",
"too", "very")
text <- removeWords(text, stopwords("en"))
text <- removeWords(text, otherstopwords)
return(unname(text))
}
cleaned dataset
dataset$cleanedtext <- clean_text(dataset$comment_text)
# separate out training and test data
train <-subset(dataset, type=="train",select=c(-comment_text, -type))
test<-subset(dataset, type=="test",select=c(id, cleanedtext))
exploratory analysis - toxic, severe_toxic, obscene, threat, insult, identity_hate / toxic, obscene, insult are thee classes that go together
summary(train)
## id toxic severe_toxic obscene
## Length:159571 Min. :0.00000 Min. :0.000000 Min. :0.00000
## Class :character 1st Qu.:0.00000 1st Qu.:0.000000 1st Qu.:0.00000
## Mode :character Median :0.00000 Median :0.000000 Median :0.00000
## Mean :0.09584 Mean :0.009996 Mean :0.05295
## 3rd Qu.:0.00000 3rd Qu.:0.000000 3rd Qu.:0.00000
## Max. :1.00000 Max. :1.000000 Max. :1.00000
## threat insult identity_hate
## Min. :0.000000 Min. :0.00000 Min. :0.000000
## 1st Qu.:0.000000 1st Qu.:0.00000 1st Qu.:0.000000
## Median :0.000000 Median :0.00000 Median :0.000000
## Mean :0.002995 Mean :0.04936 Mean :0.008805
## 3rd Qu.:0.000000 3rd Qu.:0.00000 3rd Qu.:0.000000
## Max. :1.000000 Max. :1.00000 Max. :1.000000
## cleanedtext
## Length:159571
## Class :character
## Mode :character
##
##
##
str(train)
## Classes 'data.table' and 'data.frame': 159571 obs. of 8 variables:
## $ id : chr "0000997932d777bf" "000103f0d9cfb60f" "000113f07ec002fd" "0001b41b1c6bb37e" ...
## $ toxic : num 0 0 0 0 0 0 1 0 0 0 ...
## $ severe_toxic : num 0 0 0 0 0 0 1 0 0 0 ...
## $ obscene : num 0 0 0 0 0 0 1 0 0 0 ...
## $ threat : num 0 0 0 0 0 0 0 0 0 0 ...
## $ insult : num 0 0 0 0 0 0 1 0 0 0 ...
## $ identity_hate: num 0 0 0 0 0 0 0 0 0 0 ...
## $ cleanedtext : chr "explanation username hardcore metallica fan reverted vandalisms closure gas voted new york doll"| __truncated__ "d AWWWW matches background colour seemingly stuck thanks january " " man trying war guy constantly removing relevant information talking instead seems care "| __truncated__ " real suggestions improvement wondered section statistics later subsection types accidents th"| __truncated__ ...
## - attr(*, ".internal.selfref")=<externalptr>
dim(train)
## [1] 159571 8
colSums(sapply(train, is.na))
## id toxic severe_toxic obscene threat
## 0 0 0 0 0
## insult identity_hate cleanedtext
## 0 0 0
# plotting number of documents with each toxicity category
train2<-train[,c(2:7)]
x<-barplot(colSums(train2), ylim = c(0,20000), xlab="type of toxicity", ylab="frequency", main ="Frequency of each toxicity level")
y<-as.matrix(colSums(train2))
text(x, y, labels=as.character(y), pos = 3, cex=1)
train$toxcount<-rowSums(train[,2:7])
# plotting number of documents with the multiple labels per document
Num_Class<-table(train$toxcount)
tox_class<-as.data.frame(Num_Class, row.names = NULL, responseName = "Num_Doc", sep=" ")
colnames(tox_class)[colnames(tox_class)=="Var1"] <- "Num_of_classes"
c1<-ggplot(data = subset(tox_class, Num_of_classes!=0), aes(x=Num_of_classes, y=Num_Doc)) + geom_bar(stat="identity")+ggtitle("Document frequency for multi labels")+geom_text(aes(label = Num_Doc), vjust = 1.5, color = "red")
ggsave("Document frequency for multi labels.png", width=297, height =210, units = "mm")
TDM, words and wordcloud for label toxic test
# load the data as a corpus
train_to <- train[toxic==1]$cleanedtext
train_to <- Corpus(VectorSource(train_to)) %>%
tm_map(stemDocument)
## build a term document matrix
tdm_to<-TermDocumentMatrix(train_to)
m_to<-as.matrix(tdm_to)
v_to<-sort(rowSums(m_to), decreasing=TRUE)
d_to<-data.frame(word = names(v_to), freq=v_to)
## generate wordcloud with layout to include title
set.seed(1234)
layout(matrix(c(1, 2), nrow=2), heights=c(1, 4))
par(mar=rep(0, 4))
plot.new()
text(x=.5, y=.1, "Toxic wordcloud")
wordcloud(words=d_to$word, freq=d_to$freq, min.freq=1, max.words = 200, random.order=FALSE, rot.per=0.35, colors=brewer.pal(8, "Dark2"))
TDM, words and wordcloud for label severe_toxic test
# load the data as a corpus
train_st <- train[severe_toxic==1]$cleanedtext
train_st <- Corpus(VectorSource(train_st)) %>%
tm_map(stemDocument)
## build a term document matrix
tdm_st<-TermDocumentMatrix(train_st)
m_st<-as.matrix(tdm_st)
v_st<-sort(rowSums(m_st), decreasing=TRUE)
d_st<-data.frame(word = names(v_st), freq=v_st)
## generate wordcloud with layout to include title
set.seed(1234)
layout(matrix(c(1, 2), nrow=2), heights=c(1, 4))
par(mar=rep(0,4))
plot.new()
text(x=.5, y=.1, "Severe Toxic wordcloud")
wordcloud(words=d_st$word, freq=d_st$freq, min.freq=1, max.words = 200, random.order=FALSE, rot.per=0.35, colors=brewer.pal(8, "Dark2"))
TDM, words and wordcloud for label obscene test
# load the data as a corpus
train_o <- train[obscene==1]$cleanedtext
train_o <- Corpus(VectorSource(train_o)) %>%
tm_map(stemDocument)
## build a term document matrix
tdm_o<-TermDocumentMatrix(train_o)
m_o<-as.matrix(tdm_o)
v_o<-sort(rowSums(m_o), decreasing=TRUE)
d_o<-data.frame(word = names(v_o), freq=v_o)
## generate wordcloud
set.seed(1234)
layout(matrix(c(1,2), nrow=2), heights=c(1,4))
par(mar=rep(0,4))
plot.new()
text(x=.5, y=.1, "Obscene wordcloud")
wordcloud(words=d_o$word, freq=d_o$freq, min.freq=1, max.words = 200, random.order=FALSE, rot.per=0.35, colors=brewer.pal(8, "Dark2"))
TDM, words and wordcloud for label threat test
# load the data as a corpus
train_t <- train[threat==1]$cleanedtext
train_t <- Corpus(VectorSource(train_t)) %>%
tm_map(stemDocument)
## build a term document matrix
tdm_t<-TermDocumentMatrix(train_t)
m_t<-as.matrix(tdm_t)
v_t<-sort(rowSums(m_t), decreasing=TRUE)
d_t<-data.frame(word = names(v_t), freq=v_t)
## generate wordcloud
set.seed(1234)
layout(matrix(c(1,2), nrow=2), heights=c(1,4))
par(mar=rep(0,4))
plot.new()
text(x=.5, y=0.1, "Threat wordcloud")
wordcloud(words=d_t$word, freq=d_t$freq, min.freq=1, max.words = 200, random.order=FALSE, rot.per=0.35, colors=brewer.pal(8, "Dark2"))
TDM, words and wordcloud for label insult test
# load the data as a corpus
train_i <- train[insult==1]$cleanedtext
train_i <- Corpus(VectorSource(train_i)) %>%
tm_map(stemDocument)
## build a term document matrix
tdm_i<-TermDocumentMatrix(train_i)
m_i<-as.matrix(tdm_i)
v_i<-sort(rowSums(m_i), decreasing=TRUE)
d_i<-data.frame(word = names(v_i), freq=v_i)
## generate wordcloud
set.seed(1234)
layout(matrix(c(1,2), nrow=2), heights=c(1,4))
par(mar=rep(0,4))
plot.new()
text(x=.5, y=.5, "Insult wordcloud")
wordcloud(words=d_i$word, freq=d_i$freq, min.freq=1, max.words = 200, random.order=FALSE, rot.per=0.35, colors=brewer.pal(8, "Dark2"))
TDM, words and wordcloud for label identity_hate test
traindata_ih<-subset(traindata, identity_hate==1)
# load the data as a corpus
train_ih <- train[identity_hate==1]$cleanedtext
train_ih <- Corpus(VectorSource(train_ih)) %>%
tm_map(stemDocument)
## build a term document matrix
tdm_ih<-TermDocumentMatrix(train_ih)
m_ih<-as.matrix(tdm_ih)
v_ih<-sort(rowSums(m_ih), decreasing=TRUE)
d_ih<-data.frame(word = names(v_ih), freq=v_ih)
## generate wordcloud
set.seed(1234)
layout(matrix(c(1,2), nrow=2), heights=c(1,4))
par(mar=rep(0,4))
plot.new()
text(x=.5, y=.5, "Identity Hate wordcloud")
wordcloud(words=d_ih$word, freq=d_ih$freq, min.freq=1, max.words = 200, random.order=FALSE, rot.per=0.35, colors=brewer.pal(8, "Dark2"))
The sentences are broken up into words.
trainWords <- train %>%
unnest_tokens(word, cleanedtext) %>%
count(toxic,severe_toxic,obscene,threat,insult,identity_hate,word) %>%
ungroup()
datatable(head(trainWords,20), style="bootstrap", class="table-condensed", options = list(dom = 'tp',scrollX = TRUE))
The combinations of toxic,severe toxic,obscene,threat,insult and identity hate will create unique categories. We will display those categories here.
trainWords <- train %>%
unnest_tokens(word, cleanedtext) %>%
count(toxic,severe_toxic,obscene,threat,insult,identity_hate,word) %>%
ungroup()
total_words <- trainWords %>%
group_by(toxic,severe_toxic,obscene,threat,insult,identity_hate) %>%
summarise(total = sum(n))
total_words
TF(t) = (Number of times term t appears in a document) / (Total number of terms in the document)
IDF(t) = log_e(Total number of documents / Number of documents with term t in it).
Value = TF * IDF
Here using TF-IDF , we investigate the Twenty Most Important words
Category =1:41
fillColor = "#8db600"
fillColor2 = "#ffbf00"
total_words$Category = Category
trainWords <- left_join(trainWords, total_words)
#Now we are ready to use the bind_tf_idf which computes the tf-idf for each term.
trainWords <- trainWords %>%
bind_tf_idf(word, Category, n)
plot_trainWords <- trainWords %>%
arrange(desc(tf_idf)) %>%
mutate(word = factor(word, levels = rev(unique(word))))
plot_trainWords %>%
top_n(20) %>%
ggplot(aes(word, tf_idf)) +
geom_col(fill = fillColor) +
labs(x = NULL, y = "tf-idf", title = "TDIF for all words") +
coord_flip() +
theme_bw()
We plot the TF-IDF for the Toxic Comments
plot_trainWords %>%
filter(toxic == 1 ) %>%
top_n(20) %>%
ggplot(aes(word, tf_idf)) +
geom_col(fill = fillColor2) +
labs(x = NULL, y = "tf-idf", title = "TDIF for Toxic labelled comments") +
coord_flip() +
theme_bw()
We plot the TF-IDF for the Severe Toxic Comments
plot_trainWords %>%
filter(severe_toxic == 1 ) %>%
top_n(20) %>%
ggplot(aes(word, tf_idf)) +
geom_col(fill = fillColor2) +
labs(x = NULL, y = "tf-idf", title = "TDIF for Severe Toxic labelled comments") +
coord_flip() +
theme_bw()
We plot the TF-IDF for the Obscene Comments
plot_trainWords %>%
filter(obscene == 1 ) %>%
top_n(20) %>%
ggplot(aes(word, tf_idf)) +
geom_col(fill = fillColor2) +
labs(x = NULL, y = "tf-idf", title = "TDIF for obscene labelled comments") +
coord_flip() +
theme_bw()
We plot the TF-IDF for the Threat Comments
plot_trainWords %>%
filter(threat == 1 ) %>%
top_n(20) %>%
ggplot(aes(word, tf_idf)) +
geom_col(fill = fillColor2) +
labs(x = NULL, y = "tf-idf", title="TDIF for threat labelled comments") +
coord_flip()
theme_bw()
## List of 57
## $ line :List of 6
## ..$ colour : chr "black"
## ..$ size : num 0.5
## ..$ linetype : num 1
## ..$ lineend : chr "butt"
## ..$ arrow : logi FALSE
## ..$ inherit.blank: logi TRUE
## ..- attr(*, "class")= chr [1:2] "element_line" "element"
## $ rect :List of 5
## ..$ fill : chr "white"
## ..$ colour : chr "black"
## ..$ size : num 0.5
## ..$ linetype : num 1
## ..$ inherit.blank: logi TRUE
## ..- attr(*, "class")= chr [1:2] "element_rect" "element"
## $ text :List of 11
## ..$ family : chr ""
## ..$ face : chr "plain"
## ..$ colour : chr "black"
## ..$ size : num 11
## ..$ hjust : num 0.5
## ..$ vjust : num 0.5
## ..$ angle : num 0
## ..$ lineheight : num 0.9
## ..$ margin :Classes 'margin', 'unit' atomic [1:4] 0 0 0 0
## .. .. ..- attr(*, "valid.unit")= int 8
## .. .. ..- attr(*, "unit")= chr "pt"
## ..$ debug : logi FALSE
## ..$ inherit.blank: logi TRUE
## ..- attr(*, "class")= chr [1:2] "element_text" "element"
## $ axis.title.x :List of 11
## ..$ family : NULL
## ..$ face : NULL
## ..$ colour : NULL
## ..$ size : NULL
## ..$ hjust : NULL
## ..$ vjust : num 1
## ..$ angle : NULL
## ..$ lineheight : NULL
## ..$ margin :Classes 'margin', 'unit' atomic [1:4] 5.5 0 0 0
## .. .. ..- attr(*, "valid.unit")= int 8
## .. .. ..- attr(*, "unit")= chr "pt"
## ..$ debug : NULL
## ..$ inherit.blank: logi TRUE
## ..- attr(*, "class")= chr [1:2] "element_text" "element"
## $ axis.title.x.top :List of 11
## ..$ family : NULL
## ..$ face : NULL
## ..$ colour : NULL
## ..$ size : NULL
## ..$ hjust : NULL
## ..$ vjust : num 0
## ..$ angle : NULL
## ..$ lineheight : NULL
## ..$ margin :Classes 'margin', 'unit' atomic [1:4] 0 0 5.5 0
## .. .. ..- attr(*, "valid.unit")= int 8
## .. .. ..- attr(*, "unit")= chr "pt"
## ..$ debug : NULL
## ..$ inherit.blank: logi TRUE
## ..- attr(*, "class")= chr [1:2] "element_text" "element"
## $ axis.title.y :List of 11
## ..$ family : NULL
## ..$ face : NULL
## ..$ colour : NULL
## ..$ size : NULL
## ..$ hjust : NULL
## ..$ vjust : num 1
## ..$ angle : num 90
## ..$ lineheight : NULL
## ..$ margin :Classes 'margin', 'unit' atomic [1:4] 0 5.5 0 0
## .. .. ..- attr(*, "valid.unit")= int 8
## .. .. ..- attr(*, "unit")= chr "pt"
## ..$ debug : NULL
## ..$ inherit.blank: logi TRUE
## ..- attr(*, "class")= chr [1:2] "element_text" "element"
## $ axis.title.y.right :List of 11
## ..$ family : NULL
## ..$ face : NULL
## ..$ colour : NULL
## ..$ size : NULL
## ..$ hjust : NULL
## ..$ vjust : num 0
## ..$ angle : num -90
## ..$ lineheight : NULL
## ..$ margin :Classes 'margin', 'unit' atomic [1:4] 0 0 0 5.5
## .. .. ..- attr(*, "valid.unit")= int 8
## .. .. ..- attr(*, "unit")= chr "pt"
## ..$ debug : NULL
## ..$ inherit.blank: logi TRUE
## ..- attr(*, "class")= chr [1:2] "element_text" "element"
## $ axis.text :List of 11
## ..$ family : NULL
## ..$ face : NULL
## ..$ colour : chr "grey30"
## ..$ size :Class 'rel' num 0.8
## ..$ hjust : NULL
## ..$ vjust : NULL
## ..$ angle : NULL
## ..$ lineheight : NULL
## ..$ margin : NULL
## ..$ debug : NULL
## ..$ inherit.blank: logi TRUE
## ..- attr(*, "class")= chr [1:2] "element_text" "element"
## $ axis.text.x :List of 11
## ..$ family : NULL
## ..$ face : NULL
## ..$ colour : NULL
## ..$ size : NULL
## ..$ hjust : NULL
## ..$ vjust : num 1
## ..$ angle : NULL
## ..$ lineheight : NULL
## ..$ margin :Classes 'margin', 'unit' atomic [1:4] 2.2 0 0 0
## .. .. ..- attr(*, "valid.unit")= int 8
## .. .. ..- attr(*, "unit")= chr "pt"
## ..$ debug : NULL
## ..$ inherit.blank: logi TRUE
## ..- attr(*, "class")= chr [1:2] "element_text" "element"
## $ axis.text.x.top :List of 11
## ..$ family : NULL
## ..$ face : NULL
## ..$ colour : NULL
## ..$ size : NULL
## ..$ hjust : NULL
## ..$ vjust : num 0
## ..$ angle : NULL
## ..$ lineheight : NULL
## ..$ margin :Classes 'margin', 'unit' atomic [1:4] 0 0 2.2 0
## .. .. ..- attr(*, "valid.unit")= int 8
## .. .. ..- attr(*, "unit")= chr "pt"
## ..$ debug : NULL
## ..$ inherit.blank: logi TRUE
## ..- attr(*, "class")= chr [1:2] "element_text" "element"
## $ axis.text.y :List of 11
## ..$ family : NULL
## ..$ face : NULL
## ..$ colour : NULL
## ..$ size : NULL
## ..$ hjust : num 1
## ..$ vjust : NULL
## ..$ angle : NULL
## ..$ lineheight : NULL
## ..$ margin :Classes 'margin', 'unit' atomic [1:4] 0 2.2 0 0
## .. .. ..- attr(*, "valid.unit")= int 8
## .. .. ..- attr(*, "unit")= chr "pt"
## ..$ debug : NULL
## ..$ inherit.blank: logi TRUE
## ..- attr(*, "class")= chr [1:2] "element_text" "element"
## $ axis.text.y.right :List of 11
## ..$ family : NULL
## ..$ face : NULL
## ..$ colour : NULL
## ..$ size : NULL
## ..$ hjust : num 0
## ..$ vjust : NULL
## ..$ angle : NULL
## ..$ lineheight : NULL
## ..$ margin :Classes 'margin', 'unit' atomic [1:4] 0 0 0 2.2
## .. .. ..- attr(*, "valid.unit")= int 8
## .. .. ..- attr(*, "unit")= chr "pt"
## ..$ debug : NULL
## ..$ inherit.blank: logi TRUE
## ..- attr(*, "class")= chr [1:2] "element_text" "element"
## $ axis.ticks :List of 6
## ..$ colour : chr "grey20"
## ..$ size : NULL
## ..$ linetype : NULL
## ..$ lineend : NULL
## ..$ arrow : logi FALSE
## ..$ inherit.blank: logi TRUE
## ..- attr(*, "class")= chr [1:2] "element_line" "element"
## $ axis.ticks.length :Class 'unit' atomic [1:1] 2.75
## .. ..- attr(*, "valid.unit")= int 8
## .. ..- attr(*, "unit")= chr "pt"
## $ axis.line : list()
## ..- attr(*, "class")= chr [1:2] "element_blank" "element"
## $ axis.line.x : NULL
## $ axis.line.y : NULL
## $ legend.background :List of 5
## ..$ fill : NULL
## ..$ colour : logi NA
## ..$ size : NULL
## ..$ linetype : NULL
## ..$ inherit.blank: logi TRUE
## ..- attr(*, "class")= chr [1:2] "element_rect" "element"
## $ legend.margin :Classes 'margin', 'unit' atomic [1:4] 0.2 0.2 0.2 0.2
## .. ..- attr(*, "valid.unit")= int 1
## .. ..- attr(*, "unit")= chr "cm"
## $ legend.spacing :Class 'unit' atomic [1:1] 0.4
## .. ..- attr(*, "valid.unit")= int 1
## .. ..- attr(*, "unit")= chr "cm"
## $ legend.spacing.x : NULL
## $ legend.spacing.y : NULL
## $ legend.key :List of 5
## ..$ fill : chr "white"
## ..$ colour : logi NA
## ..$ size : NULL
## ..$ linetype : NULL
## ..$ inherit.blank: logi TRUE
## ..- attr(*, "class")= chr [1:2] "element_rect" "element"
## $ legend.key.size :Class 'unit' atomic [1:1] 1.2
## .. ..- attr(*, "valid.unit")= int 3
## .. ..- attr(*, "unit")= chr "lines"
## $ legend.key.height : NULL
## $ legend.key.width : NULL
## $ legend.text :List of 11
## ..$ family : NULL
## ..$ face : NULL
## ..$ colour : NULL
## ..$ size :Class 'rel' num 0.8
## ..$ hjust : NULL
## ..$ vjust : NULL
## ..$ angle : NULL
## ..$ lineheight : NULL
## ..$ margin : NULL
## ..$ debug : NULL
## ..$ inherit.blank: logi TRUE
## ..- attr(*, "class")= chr [1:2] "element_text" "element"
## $ legend.text.align : NULL
## $ legend.title :List of 11
## ..$ family : NULL
## ..$ face : NULL
## ..$ colour : NULL
## ..$ size : NULL
## ..$ hjust : num 0
## ..$ vjust : NULL
## ..$ angle : NULL
## ..$ lineheight : NULL
## ..$ margin : NULL
## ..$ debug : NULL
## ..$ inherit.blank: logi TRUE
## ..- attr(*, "class")= chr [1:2] "element_text" "element"
## $ legend.title.align : NULL
## $ legend.position : chr "right"
## $ legend.direction : NULL
## $ legend.justification : chr "center"
## $ legend.box : NULL
## $ legend.box.margin :Classes 'margin', 'unit' atomic [1:4] 0 0 0 0
## .. ..- attr(*, "valid.unit")= int 1
## .. ..- attr(*, "unit")= chr "cm"
## $ legend.box.background: list()
## ..- attr(*, "class")= chr [1:2] "element_blank" "element"
## $ legend.box.spacing :Class 'unit' atomic [1:1] 0.4
## .. ..- attr(*, "valid.unit")= int 1
## .. ..- attr(*, "unit")= chr "cm"
## $ panel.background :List of 5
## ..$ fill : chr "white"
## ..$ colour : logi NA
## ..$ size : NULL
## ..$ linetype : NULL
## ..$ inherit.blank: logi TRUE
## ..- attr(*, "class")= chr [1:2] "element_rect" "element"
## $ panel.border :List of 5
## ..$ fill : logi NA
## ..$ colour : chr "grey20"
## ..$ size : NULL
## ..$ linetype : NULL
## ..$ inherit.blank: logi TRUE
## ..- attr(*, "class")= chr [1:2] "element_rect" "element"
## $ panel.spacing :Class 'unit' atomic [1:1] 5.5
## .. ..- attr(*, "valid.unit")= int 8
## .. ..- attr(*, "unit")= chr "pt"
## $ panel.spacing.x : NULL
## $ panel.spacing.y : NULL
## $ panel.grid.major :List of 6
## ..$ colour : chr "grey92"
## ..$ size : NULL
## ..$ linetype : NULL
## ..$ lineend : NULL
## ..$ arrow : logi FALSE
## ..$ inherit.blank: logi TRUE
## ..- attr(*, "class")= chr [1:2] "element_line" "element"
## $ panel.grid.minor :List of 6
## ..$ colour : chr "grey92"
## ..$ size : num 0.25
## ..$ linetype : NULL
## ..$ lineend : NULL
## ..$ arrow : logi FALSE
## ..$ inherit.blank: logi TRUE
## ..- attr(*, "class")= chr [1:2] "element_line" "element"
## $ panel.ontop : logi FALSE
## $ plot.background :List of 5
## ..$ fill : NULL
## ..$ colour : chr "white"
## ..$ size : NULL
## ..$ linetype : NULL
## ..$ inherit.blank: logi TRUE
## ..- attr(*, "class")= chr [1:2] "element_rect" "element"
## $ plot.title :List of 11
## ..$ family : NULL
## ..$ face : NULL
## ..$ colour : NULL
## ..$ size :Class 'rel' num 1.2
## ..$ hjust : num 0
## ..$ vjust : num 1
## ..$ angle : NULL
## ..$ lineheight : NULL
## ..$ margin :Classes 'margin', 'unit' atomic [1:4] 0 0 6.6 0
## .. .. ..- attr(*, "valid.unit")= int 8
## .. .. ..- attr(*, "unit")= chr "pt"
## ..$ debug : NULL
## ..$ inherit.blank: logi TRUE
## ..- attr(*, "class")= chr [1:2] "element_text" "element"
## $ plot.subtitle :List of 11
## ..$ family : NULL
## ..$ face : NULL
## ..$ colour : NULL
## ..$ size :Class 'rel' num 0.9
## ..$ hjust : num 0
## ..$ vjust : num 1
## ..$ angle : NULL
## ..$ lineheight : NULL
## ..$ margin :Classes 'margin', 'unit' atomic [1:4] 0 0 4.95 0
## .. .. ..- attr(*, "valid.unit")= int 8
## .. .. ..- attr(*, "unit")= chr "pt"
## ..$ debug : NULL
## ..$ inherit.blank: logi TRUE
## ..- attr(*, "class")= chr [1:2] "element_text" "element"
## $ plot.caption :List of 11
## ..$ family : NULL
## ..$ face : NULL
## ..$ colour : NULL
## ..$ size :Class 'rel' num 0.9
## ..$ hjust : num 1
## ..$ vjust : num 1
## ..$ angle : NULL
## ..$ lineheight : NULL
## ..$ margin :Classes 'margin', 'unit' atomic [1:4] 4.95 0 0 0
## .. .. ..- attr(*, "valid.unit")= int 8
## .. .. ..- attr(*, "unit")= chr "pt"
## ..$ debug : NULL
## ..$ inherit.blank: logi TRUE
## ..- attr(*, "class")= chr [1:2] "element_text" "element"
## $ plot.margin :Classes 'margin', 'unit' atomic [1:4] 5.5 5.5 5.5 5.5
## .. ..- attr(*, "valid.unit")= int 8
## .. ..- attr(*, "unit")= chr "pt"
## $ strip.background :List of 5
## ..$ fill : chr "grey85"
## ..$ colour : chr "grey20"
## ..$ size : NULL
## ..$ linetype : NULL
## ..$ inherit.blank: logi TRUE
## ..- attr(*, "class")= chr [1:2] "element_rect" "element"
## $ strip.placement : chr "inside"
## $ strip.text :List of 11
## ..$ family : NULL
## ..$ face : NULL
## ..$ colour : chr "grey10"
## ..$ size :Class 'rel' num 0.8
## ..$ hjust : NULL
## ..$ vjust : NULL
## ..$ angle : NULL
## ..$ lineheight : NULL
## ..$ margin : NULL
## ..$ debug : NULL
## ..$ inherit.blank: logi TRUE
## ..- attr(*, "class")= chr [1:2] "element_text" "element"
## $ strip.text.x :List of 11
## ..$ family : NULL
## ..$ face : NULL
## ..$ colour : NULL
## ..$ size : NULL
## ..$ hjust : NULL
## ..$ vjust : NULL
## ..$ angle : NULL
## ..$ lineheight : NULL
## ..$ margin :Classes 'margin', 'unit' atomic [1:4] 5.5 0 5.5 0
## .. .. ..- attr(*, "valid.unit")= int 8
## .. .. ..- attr(*, "unit")= chr "pt"
## ..$ debug : NULL
## ..$ inherit.blank: logi TRUE
## ..- attr(*, "class")= chr [1:2] "element_text" "element"
## $ strip.text.y :List of 11
## ..$ family : NULL
## ..$ face : NULL
## ..$ colour : NULL
## ..$ size : NULL
## ..$ hjust : NULL
## ..$ vjust : NULL
## ..$ angle : num -90
## ..$ lineheight : NULL
## ..$ margin :Classes 'margin', 'unit' atomic [1:4] 0 5.5 0 5.5
## .. .. ..- attr(*, "valid.unit")= int 8
## .. .. ..- attr(*, "unit")= chr "pt"
## ..$ debug : NULL
## ..$ inherit.blank: logi TRUE
## ..- attr(*, "class")= chr [1:2] "element_text" "element"
## $ strip.switch.pad.grid:Class 'unit' atomic [1:1] 0.1
## .. ..- attr(*, "valid.unit")= int 1
## .. ..- attr(*, "unit")= chr "cm"
## $ strip.switch.pad.wrap:Class 'unit' atomic [1:1] 0.1
## .. ..- attr(*, "valid.unit")= int 1
## .. ..- attr(*, "unit")= chr "cm"
## - attr(*, "class")= chr [1:2] "theme" "gg"
## - attr(*, "complete")= logi TRUE
## - attr(*, "validate")= logi TRUE
We plot the TF-IDF for the Insult Comments
plot_trainWords %>%
filter(insult == 1 ) %>%
top_n(20) %>%
ggplot(aes(word, tf_idf)) +
geom_col(fill = fillColor2) +
labs(x = NULL, y = "tf-idf", title = "TDIF for Insult labelled comments") +
coord_flip() +
theme_bw()
We plot the TF-IDF for the Identity hate Comments
plot_trainWords %>%
filter(identity_hate == 1 ) %>%
top_n(20) %>%
ggplot(aes(word, tf_idf)) +
geom_col(fill = fillColor2) +
labs(x = NULL, y = "tf-idf", title = "TDIF for Identity Hate labelled comments") +
coord_flip() +
theme_bw()
corpus = VCorpus(VectorSource(train$cleanedtext))
dtm = DocumentTermMatrix(corpus)
dtm = removeSparseTerms(dtm, 0.99)
dataset = as.data.frame(as.matrix(dtm))
corpus = VCorpus(VectorSource(test$cleanedtext))%>%
tm_map(stemDocument)
dtm = DocumentTermMatrix(corpus)
dtm = removeSparseTerms(dtm, 0.99)
datasetTest = as.data.frame(as.matrix(dtm))
## brining all the common words together from train and text
colnamesSame = intersect(colnames(dataset),colnames(datasetTest))
dataset = dataset[ , (colnames(dataset) %in% colnamesSame)]
datasetTest = datasetTest[ , (colnames(datasetTest) %in% colnamesSame)]
#partitioning data
# inspect(dtm[20:30, 50:75])
# <<DocumentTermMatrix (documents: 11, terms: 26)>>
# Non-/sparse entries: 1/285
# Sparsity : 100%
# Maximal term length: 9
# Weighting : term frequency (tf)
# Sample :
# Terms
# Docs book call came care case categori caus certain chang check
# 20 0 0 0 0 0 0 0 0 0 0
# 21 0 0 0 0 0 0 0 0 0 0
# 22 0 0 0 0 0 0 0 0 0 0
# 23 0 0 0 0 0 0 0 0 0 0
# 24 0 0 0 0 0 0 1 0 0 0
# 25 0 0 0 0 0 0 0 0 0 0
# 26 0 0 0 0 0 0 0 0 0 0
# 27 0 0 0 0 0 0 0 0 0 0
# 28 0 0 0 0 0 0 0 0 0 0
# 29 0 0 0 0 0 0 0 0 0 0
# cross validation of Toxic training data
set.seed(17)
train_index <- sample(1:nrow(dataset), nrow(dataset)*.75)
#labels for train and test
toxic <- train[train_index,]$toxic
part_test_toxic <- train[-train_index,]$toxic
part_train <- cbind(dataset[train_index,],toxic)
part_train$toxic = as.factor(part_train$toxic)
levels(part_train$toxic) = make.names(unique(part_train$toxic))
part_test <- dataset[-train_index,]
formula = toxic ~ .
#fitControl <- trainControl(method="none",classProbs=TRUE, summaryFunction=twoClassSummary)
fitControl <- trainControl(method="cv", number = 3, returnResamp = "all", classProbs=TRUE, summaryFunction=twoClassSummary, allowParallel = T)
#fitControl <- trainControl(method="repeatedcv", number = 10, repeats = 3, classProbs=TRUE, summaryFunction=twoClassSummary)
xgbGrid <- expand.grid(nrounds = 500,
eta = .3,
max_depth = 6,
gamma = 0,
colsample_bytree = .8,
min_child_weight = 1,
subsample = 1)
set.seed(13)
start_time <-Sys.time()
ToxicXGBt = train(formula, data = part_train,
method = "xgbTree",trControl = fitControl,
tuneGrid = xgbGrid,na.action = na.pass,metric="ROC", maximize = FALSE)
ToxicXGBt
## eXtreme Gradient Boosting
##
## 119678 samples
## 195 predictor
## 2 classes: 'X0', 'X1'
##
## No pre-processing
## Resampling: Cross-Validated (3 fold)
## Summary of sample sizes: 79785, 79785, 79786
## Resampling results:
##
## ROC Sens Spec
## 0.7744686 0.9972838 0.1701927
##
## Tuning parameter 'nrounds' was held constant at a value of 500
## 0.8
## Tuning parameter 'min_child_weight' was held constant at a value of
## 1
## Tuning parameter 'subsample' was held constant at a value of 1
end_time <-Sys.time()
end_time - start_time
## Time difference of 5.469419 mins
predToxict = predict(ToxicXGBt,part_test,type ='prob')
predToxict.resp <- ifelse(predToxict$X1 >= 0.80, 1, 0)
confusionMatrix(predToxict.resp,part_test_toxic, positive = "1")
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 36022 3232
## 1 17 622
##
## Accuracy : 0.9186
## 95% CI : (0.9158, 0.9212)
## No Information Rate : 0.9034
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.2564
## Mcnemar's Test P-Value : < 2.2e-16
##
## Sensitivity : 0.16139
## Specificity : 0.99953
## Pos Pred Value : 0.97340
## Neg Pred Value : 0.91766
## Prevalence : 0.09661
## Detection Rate : 0.01559
## Detection Prevalence : 0.01602
## Balanced Accuracy : 0.58046
##
## 'Positive' Class : 1
##
#####################################################################################################
# cross validation of Severe Toxic training data
set.seed(17)
#labels for train and test
severe_toxic <- train[train_index,]$severe_toxic
part_test_severe_toxic <- train[-train_index,]$severe_toxic
part_train <- cbind(dataset[train_index,],severe_toxic)
part_train$severe_toxic = as.factor(part_train$severe_toxic)
levels(part_train$severe_toxic) = make.names(unique(part_train$severe_toxic))
part_test <- dataset[-train_index,]
formula = severe_toxic ~ .
set.seed(13)
start_time <-Sys.time()
SToxicXGBt = train(formula, data = part_train,
method = "xgbTree",trControl = fitControl,
tuneGrid = xgbGrid,na.action = na.pass,metric="ROC", maximize=FALSE)
SToxicXGBt
## eXtreme Gradient Boosting
##
## 119678 samples
## 195 predictor
## 2 classes: 'X0', 'X1'
##
## No pre-processing
## Resampling: Cross-Validated (3 fold)
## Summary of sample sizes: 79785, 79785, 79786
## Resampling results:
##
## ROC Sens Spec
## 0.8726972 0.9988945 0.1149607
##
## Tuning parameter 'nrounds' was held constant at a value of 500
## 0.8
## Tuning parameter 'min_child_weight' was held constant at a value of
## 1
## Tuning parameter 'subsample' was held constant at a value of 1
end_time <-Sys.time()
end_time - start_time
## Time difference of 5.600067 mins
predSToxict = predict(SToxicXGBt,part_test,type ='prob')
predSToxict.resp <- ifelse(predSToxict$X1 >= 0.80, 1, 0)
confusionMatrix(predSToxict.resp,part_test_severe_toxic, positive = "1")
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 39474 404
## 1 7 8
##
## Accuracy : 0.9897
## 95% CI : (0.9887, 0.9907)
## No Information Rate : 0.9897
## P-Value [Acc > NIR] : 0.4933
##
## Kappa : 0.0368
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 0.0194175
## Specificity : 0.9998227
## Pos Pred Value : 0.5333333
## Neg Pred Value : 0.9898691
## Prevalence : 0.0103276
## Detection Rate : 0.0002005
## Detection Prevalence : 0.0003760
## Balanced Accuracy : 0.5096201
##
## 'Positive' Class : 1
##
#####################################################################################################
# cross validation of Obscene training data
set.seed(17)
#labels for train and test
obscene <- train[train_index,]$obscene
part_test_obscene <- train[-train_index,]$obscene
part_train <- cbind(dataset[train_index,],obscene)
part_train$obscene = as.factor(part_train$obscene)
levels(part_train$obscene) = make.names(unique(part_train$obscene))
part_test <- dataset[-train_index,]
formula = obscene ~ .
set.seed(13)
start_time <-Sys.time()
ObsceneXGBt = train(formula, data = part_train,
method = "xgbTree",trControl = fitControl,
tuneGrid = xgbGrid,na.action = na.pass,metric="ROC", maximize=FALSE)
ObsceneXGBt
## eXtreme Gradient Boosting
##
## 119678 samples
## 195 predictor
## 2 classes: 'X0', 'X1'
##
## No pre-processing
## Resampling: Cross-Validated (3 fold)
## Summary of sample sizes: 79786, 79785, 79785
## Resampling results:
##
## ROC Sens Spec
## 0.8083263 0.9988089 0.2764882
##
## Tuning parameter 'nrounds' was held constant at a value of 500
## 0.8
## Tuning parameter 'min_child_weight' was held constant at a value of
## 1
## Tuning parameter 'subsample' was held constant at a value of 1
end_time <-Sys.time()
end_time - start_time
## Time difference of 5.3436 mins
predObscenet = predict(ObsceneXGBt,part_test,type ='prob')
predObscenet.resp <- ifelse(predObscenet$X1 >= 0.80, 1, 0)
confusionMatrix(predObscenet.resp,part_test_obscene, positive = "1")
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 37746 1540
## 1 31 576
##
## Accuracy : 0.9606
## 95% CI : (0.9587, 0.9625)
## No Information Rate : 0.947
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.4091
## Mcnemar's Test P-Value : < 2.2e-16
##
## Sensitivity : 0.27221
## Specificity : 0.99918
## Pos Pred Value : 0.94893
## Neg Pred Value : 0.96080
## Prevalence : 0.05304
## Detection Rate : 0.01444
## Detection Prevalence : 0.01522
## Balanced Accuracy : 0.63570
##
## 'Positive' Class : 1
##
#####################################################################################################
# cross validation of Insult training data
set.seed(17)
#labels for train and test
insult <- train[train_index,]$insult
part_test_insult <- train[-train_index,]$insult
part_train <- cbind(dataset[train_index,],insult)
part_train$insult = as.factor(part_train$insult)
levels(part_train$insult) = make.names(unique(part_train$insult))
part_test <- dataset[-train_index,]
formula = insult ~ .
set.seed(13)
start_time <-Sys.time()
InsultXGBt = train(formula, data = part_train,
method = "xgbTree",trControl = fitControl,
tuneGrid = xgbGrid,na.action = na.pass,metric="ROC", maximize=FALSE)
InsultXGBt
## eXtreme Gradient Boosting
##
## 119678 samples
## 195 predictor
## 2 classes: 'X0', 'X1'
##
## No pre-processing
## Resampling: Cross-Validated (3 fold)
## Summary of sample sizes: 79785, 79785, 79786
## Resampling results:
##
## ROC Sens Spec
## 0.7995579 0.9959042 0.2122995
##
## Tuning parameter 'nrounds' was held constant at a value of 500
## 0.8
## Tuning parameter 'min_child_weight' was held constant at a value of
## 1
## Tuning parameter 'subsample' was held constant at a value of 1
end_time <-Sys.time()
end_time - start_time
## Time difference of 5.225757 mins
predinsultt = predict(InsultXGBt,part_test,type ='prob')
predinsultt.resp <- ifelse(predinsultt$X1 >= 0.80, 1, 0)
confusionMatrix(predinsultt.resp,part_test_insult, positive = "1")
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 37884 1827
## 1 34 148
##
## Accuracy : 0.9534
## 95% CI : (0.9512, 0.9554)
## No Information Rate : 0.9505
## P-Value [Acc > NIR] : 0.004139
##
## Kappa : 0.13
## Mcnemar's Test P-Value : < 2.2e-16
##
## Sensitivity : 0.074937
## Specificity : 0.999103
## Pos Pred Value : 0.813187
## Neg Pred Value : 0.953993
## Prevalence : 0.049507
## Detection Rate : 0.003710
## Detection Prevalence : 0.004562
## Balanced Accuracy : 0.537020
##
## 'Positive' Class : 1
##
#####################################################################################################
# cross validation of Identity Hate training data
set.seed(17)
#labels for train and test
identity_hate <- train[train_index,]$identity_hate
part_test_identity_hate <- train[-train_index,]$identity_hate
part_train <- cbind(dataset[train_index,],identity_hate)
part_train$identity_hate = as.factor(part_train$identity_hate)
levels(part_train$identity_hate) = make.names(unique(part_train$identity_hate))
part_test <- dataset[-train_index,]
formula = identity_hate ~ .
set.seed(13)
start_time <-Sys.time()
IHXGBt = train(formula, data = part_train,
method = "xgbTree",trControl = fitControl,
tuneGrid = xgbGrid,na.action = na.pass,metric="ROC", maximize=FALSE)
IHXGBt
## eXtreme Gradient Boosting
##
## 119678 samples
## 195 predictor
## 2 classes: 'X0', 'X1'
##
## No pre-processing
## Resampling: Cross-Validated (3 fold)
## Summary of sample sizes: 79785, 79786, 79785
## Resampling results:
##
## ROC Sens Spec
## 0.7780075 0.9997808 0.006729856
##
## Tuning parameter 'nrounds' was held constant at a value of 500
## 0.8
## Tuning parameter 'min_child_weight' was held constant at a value of
## 1
## Tuning parameter 'subsample' was held constant at a value of 1
end_time <-Sys.time()
end_time - start_time
## Time difference of 4.883873 mins
predIHt = predict(IHXGBt,part_test,type ='prob')
predIHt.resp <- ifelse(predIHt$X1 >= 0.80, 1, 0)
confusionMatrix(predIHt.resp,part_test_identity_hate, positive = "1")
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 39528 364
## 1 0 1
##
## Accuracy : 0.9909
## 95% CI : (0.9899, 0.9918)
## No Information Rate : 0.9909
## P-Value [Acc > NIR] : 0.4929
##
## Kappa : 0.0054
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 2.740e-03
## Specificity : 1.000e+00
## Pos Pred Value : 1.000e+00
## Neg Pred Value : 9.909e-01
## Prevalence : 9.149e-03
## Detection Rate : 2.507e-05
## Detection Prevalence : 2.507e-05
## Balanced Accuracy : 5.014e-01
##
## 'Positive' Class : 1
##
#####################################################################################################
We calculate the various targets and predict the probablities
# above to avoid package loading messages
dataset2 = dataset
dataset2$toxic = train$toxic
dataset2$toxic = as.factor(dataset2$toxic)
levels(dataset2$toxic) = make.names(unique(dataset2$toxic))
formula = toxic ~ .
# (nrounds = 500, #number of iterations
# eta = .05, #learning rate lies between 0.01 - 0.30
# max_depth = 3, #controls the depth of the tree default is 6 but range 0 - Inf
# gamma = 0, #controls regularization to prevent overfitting, default is 0
# lambda = 0, #controls L2 regularization on weight to prevent overfitting
# colsample_bytree = .8, #controls the number of features supplied to a tree lies between 0.5 - 0.9
# min_child_weight = 1, # leaf node has a min sum of instance wgt < min_child_wgt, the tree splitting stops.
# subsample = 1 # controls the number of samples supplied to the tree range 0 - 1 with default at 1
set.seed(13)
start_time <-Sys.time()
ToxicXGB = train(formula, data = dataset2,
method = "xgbTree",trControl = fitControl,
tuneGrid = xgbGrid,na.action = na.pass,metric="ROC", maximize=FALSE)
ToxicXGB
## eXtreme Gradient Boosting
##
## 159571 samples
## 195 predictor
## 2 classes: 'X0', 'X1'
##
## No pre-processing
## Resampling: Cross-Validated (3 fold)
## Summary of sample sizes: 106381, 106380, 106381
## Resampling results:
##
## ROC Sens Spec
## 0.7740541 0.9977751 0.1703936
##
## Tuning parameter 'nrounds' was held constant at a value of 500
## 0.8
## Tuning parameter 'min_child_weight' was held constant at a value of
## 1
## Tuning parameter 'subsample' was held constant at a value of 1
end_time <-Sys.time()
end_time - start_time
## Time difference of 6.189664 mins
predToxic = predict(ToxicXGB,datasetTest,type ='prob')
plot(predToxic)
#####################################################################################################
We calculate the various targets and predict the probablities
dataset2 = dataset
dataset2$severe_toxic = train$severe_toxic
dataset2$severe_toxic = as.factor(dataset2$severe_toxic)
levels(dataset2$severe_toxic) = make.names(unique(dataset2$severe_toxic))
formula = severe_toxic ~ .
set.seed(13)
start_time <- Sys.time()
SToxicXGB = train(formula, data = dataset2,
method = "xgbTree",trControl = fitControl,
tuneGrid = xgbGrid,na.action = na.pass,metric="ROC", maximize=FALSE)
SToxicXGB
## eXtreme Gradient Boosting
##
## 159571 samples
## 195 predictor
## 2 classes: 'X0', 'X1'
##
## No pre-processing
## Resampling: Cross-Validated (3 fold)
## Summary of sample sizes: 106381, 106381, 106380
## Resampling results:
##
## ROC Sens Spec
## 0.8693295 0.9989682 0.1097376
##
## Tuning parameter 'nrounds' was held constant at a value of 500
## 0.8
## Tuning parameter 'min_child_weight' was held constant at a value of
## 1
## Tuning parameter 'subsample' was held constant at a value of 1
end_time <- Sys.time()
end_time-start_time
## Time difference of 6.084725 mins
predSToxic = predict(SToxicXGB,datasetTest,type ='prob')
plot(predSToxic)
#####################################################################################################
We calculate the various targets and predict the probablities
dataset2 = dataset
dataset2$obscene = train$obscene
dataset2$obscene = as.factor(dataset2$obscene)
levels(dataset2$obscene) = make.names(unique(dataset2$obscene))
formula = obscene ~ .
set.seed(13)
start_time <- Sys.time()
ObsceneXGB = train(formula, data = dataset2,
method = "xgbTree",trControl = fitControl,
tuneGrid = xgbGrid,na.action = na.pass,metric="ROC", maximize=FALSE)
ObsceneXGB
## eXtreme Gradient Boosting
##
## 159571 samples
## 195 predictor
## 2 classes: 'X0', 'X1'
##
## No pre-processing
## Resampling: Cross-Validated (3 fold)
## Summary of sample sizes: 106380, 106381, 106381
## Resampling results:
##
## ROC Sens Spec
## 0.8134006 0.9987626 0.2784946
##
## Tuning parameter 'nrounds' was held constant at a value of 500
## 0.8
## Tuning parameter 'min_child_weight' was held constant at a value of
## 1
## Tuning parameter 'subsample' was held constant at a value of 1
end_time <- Sys.time()
end_time-start_time
## Time difference of 6.055944 mins
predObscene = predict(ObsceneXGB,datasetTest,type ='prob')
plot(predObscene)
#####################################################################################################
We calculate the various targets and predict the probablities
dataset2 = dataset
dataset2$threat = train$threat
dataset2$threat = as.factor(dataset2$threat)
levels(dataset2$threat) = make.names(unique(dataset2$threat))
formula = threat ~ .
set.seed(13)
start_time<-Sys.time()
ThreatXGB = train(formula, data = dataset2,
method = "xgbTree",trControl = fitControl,
tuneGrid = xgbGrid,na.action = na.pass,metric="ROC", maximize=FALSE)
end_time <- Sys.time()
end_time-start_time
## Time difference of 6.088708 mins
predThreat = predict(ThreatXGB,datasetTest,type ='prob')
plot(predThreat)
#####################################################################################################
We calculate the various targets and predict the probablities
dataset2 = dataset
dataset2$insult = train$insult
dataset2$insult = as.factor(dataset2$insult)
levels(dataset2$insult) = make.names(unique(dataset2$insult))
formula = insult ~ .
set.seed(13)
start_time <- Sys.time()
InsultXGB = train(formula, data = dataset2,
method = "xgbTree",trControl = fitControl,
tuneGrid = xgbGrid,na.action = na.pass,metric="ROC", maximize=FALSE)
InsultXGB
## eXtreme Gradient Boosting
##
## 159571 samples
## 195 predictor
## 2 classes: 'X0', 'X1'
##
## No pre-processing
## Resampling: Cross-Validated (3 fold)
## Summary of sample sizes: 106381, 106381, 106380
## Resampling results:
##
## ROC Sens Spec
## 0.8014859 0.996137 0.2132807
##
## Tuning parameter 'nrounds' was held constant at a value of 500
## 0.8
## Tuning parameter 'min_child_weight' was held constant at a value of
## 1
## Tuning parameter 'subsample' was held constant at a value of 1
end_time <- Sys.time()
end_time-start_time
## Time difference of 6.054872 mins
predInsult = predict(InsultXGB,datasetTest,type ='prob')
plot(predInsult)
#####################################################################################################
We calculate the various targets and predict the probablities
dataset2 = dataset
dataset2$identity_hate = train$identity_hate
dataset2$identity_hate = as.factor(dataset2$identity_hate)
levels(dataset2$identity_hate) = make.names(unique(dataset2$identity_hate))
formula = identity_hate ~ .
set.seed(13)
start_time <- Sys.time()
IHXGB = train(formula, data = dataset2,
method = "xgbTree",trControl = fitControl,
tuneGrid = xgbGrid,na.action = na.pass,metric="ROC", maximize=FALSE)
IHXGB
## eXtreme Gradient Boosting
##
## 159571 samples
## 195 predictor
## 2 classes: 'X0', 'X1'
##
## No pre-processing
## Resampling: Cross-Validated (3 fold)
## Summary of sample sizes: 106380, 106381, 106381
## Resampling results:
##
## ROC Sens Spec
## 0.7838228 0.999804 0.009251666
##
## Tuning parameter 'nrounds' was held constant at a value of 500
## 0.8
## Tuning parameter 'min_child_weight' was held constant at a value of
## 1
## Tuning parameter 'subsample' was held constant at a value of 1
end_time <- Sys.time()
end_time-start_time
## Time difference of 6.075111 mins
predIH = predict(IHXGB,datasetTest,type ='prob')
plot(predIH)
#####################################################################################################
submission =testdata = fread("sample_submission.csv", header = "auto", sep="auto", nrows=-1L,blank.lines.skip=TRUE, encoding="UTF-8")
submission$toxic = predToxic$X1
submission$severe_toxic = predSToxic$X1
submission$obscene = predObscene$X1
submission$threat = predThreat$X1
submission$insult = predInsult$X1
submission$identity_hate = predIH$X1
# Write it to file
write.csv(submission, 'ToxicCommentsMar312018.csv', row.names = F)